Trends in Data Science & Business Analytics
  • Home
  • Data Cleaning & Exploration
    • Data Cleaning
    • Exploratory Data Analysis
    • Skill Gap Analysis
  • Machine Learning Methods
    • Supervised Machine Learning
    • Unsupervised Machine Learning

Exploratory Data Analytics

Code
import pandas as pd
eda = pd.read_parquet("data/eda.parquet")
Code
# identifying data analyst jobs by keyword searching
keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
            'Data Science', 'Data Analysis','Data Analytics',  'Market Research Analyst' 
            'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
            'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']

match = lambda col: eda[col].str.contains('|'.join(keywords), case=False, na=False)

eda['DATA_ANALYST_JOB'] = match('TITLE_NAME') \
             | match('SKILLS_NAME') \
             | match('SPECIALIZED_SKILLS_NAME') 
eda['DATA_ANALYST_JOB'].value_counts()
DATA_ANALYST_JOB
False    38212
True     33042
Name: count, dtype: int64
Code
import plotly.graph_objects as go
from plotly.subplots import make_subplots

df_grouped = (
    eda
    .groupby(['DATA_ANALYST_JOB','NAICS2_NAME'])
    .size()
    .reset_index(name='Job_Count')
)

short_names = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
df_grouped['Job_Type'] = df_grouped['DATA_ANALYST_JOB'].map({True:'True', False:'False'})

pivot = (
    df_grouped
    .pivot_table(index='Industry', columns='Job_Type', values='Job_Count', fill_value=0)
    .reset_index()
)
industries = pivot['Industry'].tolist()
y_true  = pivot['True'].tolist()
y_false = pivot['False'].tolist()

# -----------------------------------------------------------------------------
# 2) Build a 2-row subplot: bar on top, table below
# -----------------------------------------------------------------------------
fig = make_subplots(
    rows=2, cols=1,
    row_heights=[0.70, 0.30],           # give a bit more room to the table
    specs=[[{"type":"bar"}],[{"type":"table"}]],
    vertical_spacing=0.12              # more space between bar and table
)

colors = {'True': '#FFE5E5', 'False': '#FF6B6B'}

fig.add_trace(
    go.Bar(
        x=industries, y=y_true, name='True',
        marker=dict(color=colors['True'], line=dict(color='#A81D1D', width=1)),
        text=y_true, textposition='outside'
    ),
    row=1, col=1
)
fig.add_trace(
    go.Bar(
        x=industries, y=y_false, name='False',
        marker=dict(color=colors['False'], line=dict(color='#A81D1D', width=1)),
        text=y_false, textposition='outside'
    ),
    row=1, col=1
)


# -----------------------------------------------------------------------------
# 3) Slider steps: 0 → 8 000 in 200s
# -----------------------------------------------------------------------------
steps = []
for val in range(0, 8001, 200):
    steps.append(dict(
        label=str(val),
        method="update",
        args=[
            {"y": [
                [v if v>=val else 0 for v in y_true],
                [v if v>=val else 0 for v in y_false]
            ]},
            {"title": f"Min Jobs ≥ {val:,}"}
        ]
    ))

# -----------------------------------------------------------------------------
# 4) Final layout tweaks
# -----------------------------------------------------------------------------
fig.update_layout(
    # lift slider above everything
    sliders=[dict(
        active=0,
        currentvalue={"prefix":"Min Jobs: "},
        pad={"b":0},
        x=0.05,
        y=1.05,                # move slider way above the plot area
        xanchor="left",
        yanchor="bottom",
        len=0.7,
        font=dict(color='#A81D1D'),
        steps=steps
    )],

    title=dict(
        text="Data & Business Analytics Job Trends",
        font=dict(size=24, color='#A81D1D'),
        x=0.5,
        y=0.95,                # drop the title just below the slider
        xanchor="center",
        yanchor="top"
    ),

    width=1100, height=850,
    margin=dict(l=60, r=60, t=180, b=200),  # extra top & bottom margin

    plot_bgcolor='white',
    paper_bgcolor='white',

    xaxis=dict(
        title="Industry",
        title_font=dict(size=16, color='#A81D1D'),
        tickmode='array',
        tickvals=list(range(len(industries))),
        ticktext=industries,
        tickangle=-30,
        tickfont=dict(size=11, color='#333'),
        showline=True, linecolor='#A81D1D'
    ),
    yaxis=dict(
        title="Number of Jobs",
        title_font=dict(size=16, color='#A81D1D'),
        tickfont=dict(size=11, color='#333'),
        gridcolor='rgba(200,200,200,0.3)',
        showline=True, linecolor='#A81D1D',
        range=[0, max(max(y_true),max(y_false))*1.2]
    ),

    legend=dict(
        title="Data Analyst Job",
        title_font=dict(color='#A81D1D'),
        font=dict(size=12),
        x=0.95, y=0.95
    ),

    bargap=0.2
)

fig.write_html(
    "figures/edaplot1.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Code
import plotly.express as px
import pandas as pd

# Prepare the data
df = eda.copy()

# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    if row['DATA_ANALYST_JOB']:
        return True
    title = str(row['TITLE_NAME']).lower() if 'TITLE_NAME' in row else str(row['TITLE']).lower()
    if 'business analyst' in title:
        return True
    return False

df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Create the box plot
fig = px.box(df, 
             x='REMOTE_TYPE_NAME', 
             y='SALARY', 
             color='Job_Category',
             title='Salary Distribution by Remote Type for Analytics vs Non-Analytics Jobs',
             labels={'REMOTE_TYPE_NAME': 'Remote Type', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
             color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'})

# Beautify the layout with a red-white theme (no gradients)
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',  # Plain white background
    paper_bgcolor='#FFFFFF',  # Plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=dict(
        font=dict(size=24, color="#FF6B6B"),  # Red title for theme
        x=0.5,
        xanchor="center",
        y=0.95,
        yanchor="top"
    ),
    xaxis=dict(
        title="Remote Type",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True
    ),
    yaxis=dict(
        title="Salary ($)",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    legend=dict(
        title="Job Category",
        font=dict(size=13),
        bgcolor="#FFFFFF",
        bordercolor="#FF6B6B",  # Red border for theme
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor="left",
        yanchor="middle"
    ),
    hovermode="closest",
    hoverlabel=dict(
        bgcolor="#FFFFFF",
        font_size=12,
        font_family="Inter, sans-serif",
        font_color="#2D3748",
        bordercolor="#FF6B6B"  # Red border for hover
    )
)

fig.write_html(
    "figures/edaplot2.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Code
import plotly.express as px
import pandas as pd

# Prepare the data
df = eda.copy()

# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    if row['DATA_ANALYST_JOB']:
        return True
    title = str(row['TITLE_NAME']).lower() if 'TITLE_NAME' in row else str(row['TITLE']).lower()
    if 'business analyst' in title:
        return True
    return False

df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Group by industry and job category
df_grouped = df.groupby(['NAICS2_NAME', 'IS_ANALYTICS_JOB']).size().reset_index(name='Job_Count')
df_grouped['Job_Category'] = df_grouped['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Shorten industry names for better readability
short_map = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_map).fillna(df_grouped['NAICS2_NAME'])

# Create the stacked bar chart
fig = px.bar(df_grouped, 
             x='Industry', 
             y='Job_Count', 
             color='Job_Category',
             title='Top Industries Hiring Analytics Jobs',
             labels={'Industry': 'Industry', 'Job_Count': 'Number of Jobs', 'Job_Category': 'Job Category'},
             barmode='stack',
             color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'})

# Beautify the layout with a red-white theme (no gradients)
fig.update_layout(
    width=1000,
    height=600,
    plot_bgcolor='#FFFFFF',  # Plain white background
    paper_bgcolor='#FFFFFF',  # Plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=dict(
        font=dict(size=24, color="#FF6B6B"),  # Red title for theme
        x=0.5,
        xanchor="center",
        y=0.95,
        yanchor="top"
    ),
    xaxis=dict(
        title="Industry",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        tickangle=-45,
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True
    ),
    yaxis=dict(
        title="Number of Jobs",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    legend=dict(
        title="Job Category",
        font=dict(size=13),
        bgcolor="#FFFFFF",
        bordercolor="#FF6B6B",  # Red border for theme
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor="left",
        yanchor="middle"
    ),
    hovermode="closest",
    hoverlabel=dict(
        bgcolor="#FFFFFF",
        font_size=12,
        font_family="Inter, sans-serif",
        font_color="#2D3748",
        bordercolor="#FF6B6B"  # Red border for hover
    )
)

fig.write_html(
    "figures/edaplot3.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Code
import plotly.express as px
import pandas as pd

# Prepare the data
df = eda.copy()

# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    if row['DATA_ANALYST_JOB']:
        return True
    title = str(row['TITLE_NAME']).lower() if 'TITLE_NAME' in row else str(row['TITLE']).lower()
    if 'business analyst' in title:
        return True
    return False

df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2

# Clean the data (remove rows with missing salary or experience)
df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])

# Create the scatter plot with trend line
fig = px.scatter(df, 
                 x='Avg_Years_Experience', 
                 y='SALARY', 
                 color='Job_Category',
                 trendline='ols',  # Add trend line (ordinary least squares)
                 title='Experience Requirements vs Salary for Analytics Jobs',
                 labels={'Avg_Years_Experience': 'Average Years of Experience', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
                 color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'})

# Beautify the layout with a red-white theme (no gradients)
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',  # Plain white background
    paper_bgcolor='#FFFFFF',  # Plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=dict(
        font=dict(size=24, color="#FF6B6B"),  # Red title for theme
        x=0.5,
        xanchor="center",
        y=0.95,
        yanchor="top"
    ),
    xaxis=dict(
        title="Average Years of Experience",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    yaxis=dict(
        title="Salary ($)",
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor="#E2E8F0",
        linecolor="#2D3748",
        linewidth=2,
        showline=True,
        showgrid=True,
        zeroline=False
    ),
    legend=dict(
        title="Job Category",
        font=dict(size=13),
        bgcolor="#FFFFFF",
        bordercolor="#FF6B6B",  # Red border for theme
        borderwidth=1,
        x=1.02,
        y=0.5,
        xanchor="left",
        yanchor="middle"
    ),
    hovermode="closest",
    hoverlabel=dict(
        bgcolor="#FFFFFF",
        font_size=12,
        font_family="Inter, sans-serif",
        font_color="#2D3748",
        bordercolor="#FF6B6B"  # Red border for hover
    )
)

# Customize scatter points
fig.update_traces(
    marker=dict(
        size=8,
        opacity=0.7,
        line=dict(width=1, color="#2D3748")
    )
)

fig.write_html(
    "figures/edaplot4.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)
Code
import plotly.graph_objects as go
import pandas as pd

# Prepare the data
df = eda.copy()

# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
    if row['DATA_ANALYST_JOB']:
        return True
    title = str(row['TITLE_NAME']).lower() if 'TITLE_NAME' in row else str(row['TITLE']).lower()
    if 'business analyst' in title:
        return True
    return False

df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})

# Filter for Analytics jobs only
df_analytics = df[df['IS_ANALYTICS_JOB']].copy()

# Clean the data (remove rows with missing industry)
df_analytics = df_analytics.dropna(subset=['NAICS2_NAME'])

# Group by job category and industry to get job counts
df_grouped = df_analytics.groupby(['Job_Category', 'NAICS2_NAME']).size().reset_index(name='Job_Count')

# Shorten industry names for better readability
short_map = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services'
}
df_grouped['NAICS2_NAME'] = df_grouped['NAICS2_NAME'].map(short_map).fillna(df_grouped['NAICS2_NAME'])

# Prepare data for Sankey Diagram
# Create a list of unique labels (nodes)
labels = list(df_grouped['Job_Category'].unique()) + list(df_grouped['NAICS2_NAME'].unique())

# Create source and target indices
source = [labels.index(job_cat) for job_cat in df_grouped['Job_Category']]
target = [labels.index(industry) for industry in df_grouped['NAICS2_NAME']]
value = df_grouped['Job_Count'].tolist()

# Create the Sankey Diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="#2D3748", width=0.5),
        label=labels,
        color="#FF6B6B"  # Red nodes for the theme
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color="rgba(255, 107, 107, 0.5)"  # Semi-transparent red links
    )
)])

# Beautify the layout with a red-white theme (no gradients)
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',  # Plain white background
    paper_bgcolor='#FFFFFF',  # Plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=dict(
        text='Distribution of Analytics Job Postings by Industry',
        font=dict(size=24, color="#FF6B6B"),  # Red title for theme
        x=0.5,
        xanchor="center",
        y=0.95,
        yanchor="top"
    ),
    margin=dict(l=20, r=20, t=80, b=20),
)
fig.write_html(
    "figures/edaplot5.html",
    include_plotlyjs="cdn",  # Use CDN to load Plotly JS
    full_html=False          # Only include the plot div
)